12 seconds to load

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium import plugins
from datetime import datetime
import os
import json
import requests
import time
In [2]:
APP_TOKEN = "bX9qcIUkfVdP5XgCDw4bXmwSt"
SECRET_TOKEN = "xJxlixLhDkJY7TvdRoCYOCY3cVThZ3z7y6QE"

base_url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json?$$app_token={}".format(APP_TOKEN)
url = base_url + "{}" 
cnt_url = base_url + "{}{}" # select , where
In [3]:
where_inj = "&$where=number_of_cyclist_injured>0.0&$limit=50000"
where_kill = "&$where=number_of_cyclist_killed>0.0"
In [4]:
inj_df = pd.read_json(url.format(where_inj))
killed_df = pd.read_json(url.format(where_kill))
In [5]:
def dt(date,time):
    date = pd.to_datetime(date).dt.date
    time = pd.to_datetime(time).dt.time
    return date,time
In [6]:
killed_df.accident_date, killed_df.accident_time = dt(killed_df.accident_date, 
                                                      killed_df.accident_time)
inj_df.accident_date, inj_df.accident_time = dt(inj_df.accident_date,
                                                inj_df.accident_time)
In [7]:
df = (pd
      .concat([inj_df,killed_df])
      .drop(columns='location')
      .drop_duplicates()
      .reset_index(drop=True))
df.vehicle_type_code1 = df.vehicle_type_code1.apply(lambda x :str(x).upper())
df.vehicle_type_code2 = df.vehicle_type_code2.apply(lambda x :str(x).upper())
In [8]:
df['Accident Year'] = df.accident_date.apply(lambda x: x.year)
df['Accident Month'] = df.accident_date.apply(lambda x: x.month)
df['Accident Hour'] = df.accident_time.apply(lambda x: x.hour)
In [9]:
def create_df(group):
    return (df
            .groupby(group)
            .collision_id.count()
            .reset_index()
            .rename(columns={'collision_id':'Number of Accidents'})
           )
In [10]:
crash_mo_yr = create_df(['Accident Year','Accident Month'])
In [11]:
crash_hr = create_df('Accident Hour')
In [12]:
crash_mo_hr = create_df(['Accident Month','Accident Hour'])
In [13]:
killed_df['accident_year'] = killed_df.accident_date.apply(lambda x: x.year)
killed_df['accident_month'] = killed_df.accident_date.apply(lambda x: x.month)
killed_df['accident_hr'] = killed_df.accident_time.apply(lambda x: x.hour)
In [14]:
mo_fig = px.area(crash_mo_yr, x="Accident Month", y="Number of Accidents",animation_frame="Accident Year",
                 range_y=[0,800], range_x=[1,12])
mo_fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000
mo_fig.layout.title = "Bicycle Accidents by Month for Each Year"
mo_fig.show()
In [15]:
hr_fig = px.area(crash_mo_hr, x="Accident Hour", y="Number of Accidents",animation_frame="Accident Month",
                 range_y=[0,400], range_x=[0,23])
hr_fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000
hr_fig.layout.title = "Bicycle Accidents by Hour For Each Month"
hr_fig.show()
In [16]:
fdir = './agg_trip'
agg_files = os.listdir(fdir)
agg_df = pd.read_csv(fdir + '/' + agg_files[0]).iloc[:,[0,1]]
In [17]:
agg_df.head()
Out[17]:
Date Trips over the past 24-hours (midnight to 11:59pm)
0 7/1/18 45265
1 7/2/18 58633
2 7/3/18 54882
3 7/4/18 48368
4 7/5/18 59471
In [18]:
for i in range(1,len(agg_files)):
    agg_df = agg_df.append(pd.read_csv(fdir + '/' + agg_files[i]).iloc[:,[0,1]])
agg_df.Date = pd.to_datetime(agg_df.Date).dt.date
agg_df = agg_df.rename(columns={'Trips over the past 24-hours (midnight to 11:59pm)':'Number of Trips'})
agg_df = agg_df.sort_values('Date')
In [19]:
fig = px.line(agg_df, x='Date', y='Number of Trips', title="Number of CitiBike Trips by Day", hover_name='Date')
fig.show()
GeoJSON takes longitude, latitude. Not the other way around. Soo annoying

route data

In [20]:
trip_df = pd.read_csv('201909-citibike-tripdata.csv')
trip_df['date'] = pd.to_datetime(trip_df.starttime).dt.date
trip_df.date = trip_df.date.apply(str)
trip_df.groupby('date').bikeid.count().reset_index().sort_values('bikeid',ascending=False).head()
high_day = trip_df[trip_df.date == '2019-09-25']

coord092519 = high_day[['start station name','start station id','start station latitude',
                'start station longitude','end station name','end station id',
                'end station latitude',
                'end station longitude']].copy()
coord092519['id'] = (coord092519['start station name'] +
                      coord092519['end station name'])
coord092519 = coord092519.groupby(['start station name','start station id','start station latitude',
                                   'start station longitude',
                                   'end station name','end station id','end station latitude',
                                  'end station longitude']).id.count().reset_index()

coord092519['filt'] = coord092519.apply(lambda x: 'y' if x['start station name'] == x['end station name'] else '',
                                       axis=1)
coord092519 = coord092519[coord092519.filt != 'y'].reset_index(drop=True)
In [21]:
cohort = coord092519[coord092519.id >= 4]
cohort = cohort.rename(columns={'id':'count'})
cohort['id'] = cohort['start station id'].apply(str) + '-' + cohort['end station id'].apply(str)
In [22]:
routes = pd.read_csv('backup_route_file.csv')
routes = routes[routes.geojson != '{"message":"Too Many Requests"}'].reset_index(drop=True)
In [23]:
cohort_df = pd.merge(cohort,routes[['id','geojson']],on='id',how='inner')
cohort_df = cohort_df[['geojson']].drop_duplicates()

geojson = list(cohort_df.geojson)
gjson = []
for i in range(len(geojson)):
    gjson.append(json.loads(geojson[i])['routes'][0]['geometry']['coordinates'])
    
for i in gjson:
    for j in i:
        j.reverse()

accident data

In [24]:
loc_df = df[['latitude','longitude','on_street_name','off_street_name','accident_date']].copy()
loc_df = loc_df[(pd.isna(loc_df.latitude) == False) & 
                (loc_df.latitude != 0) &
                (loc_df.longitude != 0)
               ]
loc_df.on_street_name = loc_df.on_street_name.str.strip()
loc_df.off_street_name = loc_df.off_street_name.str.strip()
loc_df.accident_date = loc_df.accident_date.apply(str)
loc_df['lat_lon_list'] = loc_df.apply(lambda x: [x.longitude,x.latitude], axis=1)
loc_df = loc_df.sort_values('accident_date').reset_index(drop=True)
In [25]:
features = [
    {
        'type': 'Feature',
        'geometry': {
            'type': 'MultiPoint',
            'coordinates': list(loc_df.lat_lon_list),
        },
        'properties': {
            'times': list(loc_df.accident_date),
            'icon': 'circle',
            'iconstyle': {
                'fillColor': 'red',
                'fillOpacity': 0.5,
                'stroke': 'false',
                'radius': 5
            },
            'style': {'weight': 0.5}
        }
    }
]
In [113]:
nyc_map = folium.Map(location=[40.735,-73.95],zoom_start=11.5, tiles='cartodbdark_matter')
folium.PolyLine(gjson, weight=1, opacity=0.2).add_to(folium
                                                     .FeatureGroup(name='Routes')
                                                     .add_to(nyc_map))

plugins.TimestampedGeoJson(
            {
                'type': 'FeatureCollection',
                'features': features
            },
            period='P1M',
            add_last_point=True,
            auto_play=True,
            loop=False,
            max_speed=2,
            loop_button=True,
            date_options='YYYY-MM-DD',
            time_slider_drag_update=True,
            duration='P1M'
        ).add_to(nyc_map)

folium.LayerControl().add_to(nyc_map)
nyc_map.save('test.html')
nyc_map
Out[113]:

Excluding bike on bike accidents and accidents involving 3 or more parties. These types of incidents make up less than 5% of the cohort

In [24]:
bike_list = ['BIKE','BICYCLE','E-BIK','BICYCLE','BYCIC']
cause_df = df[((pd.isna(df.contributing_factor_vehicle_3) == True) & 
               ((df.vehicle_type_code1.isin(bike_list) == True) | 
                (df.vehicle_type_code2.isin(bike_list) == True))
              )]

cause_df = cause_df[(cause_df.vehicle_type_code1.isin(bike_list) == False) | 
                    (cause_df.vehicle_type_code2.isin(bike_list) == False)]

def bike_cause(x):
    if x.vehicle_type_code1 in bike_list:
        return x.contributing_factor_vehicle_1
    else:
        return x.contributing_factor_vehicle_2

def veh_cause(x):
    if x.vehicle_type_code1 not in bike_list:
        return x.contributing_factor_vehicle_1
    else:
        return x.contributing_factor_vehicle_2

cause_df['bike_cause'] = cause_df.apply(bike_cause, axis=1)
cause_df['veh_cause'] = cause_df.apply(veh_cause, axis=1)
# cause_df = cause_df[['collision_id','bike_cause', 'veh_cause']]
In [25]:
bike_cause_df = (cause_df
                 .groupby('bike_cause')
                 .collision_id.count()
                 .reset_index()
                 .sort_values('collision_id', ascending=False)
                 .head(15)
                 .reset_index(drop=True)
                )
veh_cause_df = (cause_df
                 .groupby('veh_cause')
                 .collision_id.count()
                 .reset_index()
                 .sort_values('collision_id', ascending=False)
                 .head(15)
                 .reset_index(drop=True)
                )
In [26]:
veh_cause_df
Out[26]:
veh_cause collision_id
0 Unspecified 12610
1 Driver Inattention/Distraction 5911
2 Failure to Yield Right-of-Way 2852
3 Pedestrian/Bicyclist/Other Pedestrian Error/Co... 1503
4 Passenger Distraction 1372
5 Traffic Control Disregarded 850
6 Other Vehicular 629
7 Passing or Lane Usage Improper 579
8 Physical Disability 421
9 Turning Improperly 359
10 View Obstructed/Limited 314
11 Lost Consciousness 308
12 Following Too Closely 279
13 Driver Inexperience 261
14 Prescription Medication 243
In [27]:
bike_cause_df
Out[27]:
bike_cause collision_id
0 Unspecified 21755
1 Driver Inattention/Distraction 3130
2 Pedestrian/Bicyclist/Other Pedestrian Error/Co... 1203
3 Failure to Yield Right-of-Way 938
4 Passenger Distraction 674
5 Other Vehicular 522
6 Traffic Control Disregarded 471
7 Passing or Lane Usage Improper 434
8 Driver Inexperience 253
9 Passing Too Closely 234
10 Following Too Closely 222
11 Unsafe Speed 197
12 Turning Improperly 163
13 Lost Consciousness 133
14 View Obstructed/Limited 132